%% This is file `elsarticle-template-3-num.tex',
%%
%% Copyright 2009 Elsevier Ltd
%%
%% This file is part of the 'Elsarticle Bundle'.
%% ---------------------------------------------
%%
%% It may be distributed under the conditions of the LaTeX Project Public
%% License, either version 1.2 of this license or (at your option) any
%% later version.  The latest version of this license is in
%%    http://www.latex-project.org/lppl.txt
%% and version 1.2 or later is part of all distributions of LaTeX
%% version 1999/12/01 or later.
%%
%% The list of all files belonging to the 'Elsarticle Bundle' is
%% given in the file `manifest.txt'.
%%
%% Template article for Elsevier's document class `elsarticle'
%% with numbered style bibliographic references
%%
%% $Id: elsarticle-template-3-num.tex 165 2009-10-08 07:58:10Z rishi $
%% $URL: http://lenova.river-valley.com/svn/elsbst/trunk/elsarticle-template-3-num.tex $
%%

% SUBMISSION: USE PREPRINT, FOR AN IDEA OF FINAL FORMAT USE FINAL
\documentclass[preprint,12pt]{elsarticle}
%\documentclass[final,3p,times,twocolumn]{elsarticle}

%% Use the option review to obtain double line spacing
%% \documentclass[preprint,review,12pt]{elsarticle}

%% Use the options 1p,twocolumn; 3p; 3p,twocolumn; 5p; or 5p,twocolumn
%% for a journal layout:
%% \documentclass[final,1p,times]{elsarticle}
%% \documentclass[final,1p,times,twocolumn]{elsarticle}
%% \documentclass[final,3p,times]{elsarticle}
%% \documentclass[final,3p,times,twocolumn]{elsarticle}
%% \documentclass[final,5p,times]{elsarticle}
%% \documentclass[final,5p,times,twocolumn]{elsarticle}

%% if you use PostScript figures in your article
%% use the graphics package for simple commands
%% \usepackage{graphics}
%% or use the graphicx package for more complicated commands
%% \usepackage{graphicx}
%% or use the epsfig package if you prefer to use the old commands
%% \usepackage{epsfig}

%% The amssymb package provides various useful mathematical symbols
\usepackage{amssymb}
%% The amsthm package provides extended theorem environments
%% \usepackage{amsthm}

%% The numcompress package shortens the last page in references.
%% `nodots' option removes dots from firstnames in references.
\usepackage[nodots]{numcompress}

%% The lineno package adds line numbers. Start line numbering with
%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on
%% for the whole article with \linenumbers after \end{frontmatter}.
%% \usepackage{lineno}

\usepackage{subfig}
\usepackage{linguex}
\usepackage{color}
\usepackage{comment}
\usepackage{multirow}
\usepackage{url}

%% natbib.sty is loaded by default. However, natbib options can be
%% provided with \biboptions{...} command. Following options are
%% valid:

%%   round  -  round parentheses are used (default)
%%   square -  square brackets are used   [option]
%%   curly  -  curly braces are used      {option}
%%   angle  -  angle brackets are used    <option>
%%   semicolon  -  multiple citations separated by semi-colon
%%   colon  - same as semicolon, an earlier confusion
%%   comma  -  separated by comma
%%   numbers-  selects numerical citations
%%   super  -  numerical citations as superscripts
%%   sort   -  sorts multiple citations according to order in ref. list
%%   sort&compress   -  like sort, but also compresses numerical citations
%%   compress - compresses without sorting
%%
%% \biboptions{comma,round}

% \biboptions{}


\journal{Journal of Biomedical Informatics}

\hyphenation{bio-lexi-con}

\begin{document}

\begin{frontmatter}

%% Title, authors and addresses

%% use the tnoteref command within \title for footnotes;
%% use the tnotetext command for the associated footnote;
%% use the fnref command within \author or \address for footnotes;
%% use the fntext command for the associated footnote;
%% use the corref command within \author for corresponding author footnotes;
%% use the cortext command for the associated footnote;
%% use the ead command for the email address,
%% and the form \ead[url] for the home page:
%%
%% \title{Title\tnoteref{label1}}
%% \tnotetext[label1]{}
%% \author{Name\corref{cor1}\fnref{label2}}
%% \ead{email address}
%% \ead[url]{home page}
%% \fntext[label2]{}
%% \cortext[cor1]{}
%% \address{Address\fnref{label3}}
%% \fntext[label3]{}

% \title{Verb Subcategorization in Biomedicine: Evaluation and Investigation}
\title{Acquisition and Evaluation of Verb Subcategorization Resources for Biomedicine}

% notes/q's for t&a: title ok?

% An investigation of challenges in the automatic acquisition of verb subcategorization information in the biomedical literature

%% use optional labels to link authors explicitly to addresses:
%% \author[label1,label2]{<author name>}
%% \address[label1]{<address>}
%% \address[label2]{<address>}

\author[cam]{Laura Rimell}
\ead{Laura.Rimell@cl.cam.ac.uk}

\author[cam]{Thomas Lippincott}
\ead{Thomas.Lippincott@cl.cam.ac.uk}

\author[aus]{Karin Verspoor}
\ead{karin.verspoor@nicta.com.au}

\author[col]{Helen L. Johnson}
\ead{helen.johnson@ucdenver.edu}

\author[cam]{Anna Korhonen}
\ead{Anna.Korhonen@cl.cam.ac.uk}

% Anna: double check the email addresses before final version

\address[cam]{Computer Laboratory, University of Cambridge, 15 JJ Thomson Avenue, Cambridge CB3 0FD, UK}
\address[aus]{National ICT Australia, Victoria Research Lab, Melbourne VIC 3010, Australia}
\address[col]{Department of Pharmacology, Center for Computational Pharmacology,
University of Colorado School of Medicine, Aurora, CO, USA}

\begin{abstract}
%% Text of abstract

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

This paper presents three new resources related to verb
subcategorization frames (SCF) in biomedicine.
%,  including 
We present
the first
biomedical SCF gold standards, capturing two different but widely-used definitions of
subcategorization, and suitable for evaluating automatically acquired
SCF resources in biomedicine.
%,  and 
We also present
a new SCF lexicon acquired
automatically from
% across the entire 
the PubMed Open Access collection (PMC OA), covering a large number of biomedical subdomains. The resources are made publicly available. 
% Though SCF
% information is important for information extraction, there has been no
% previous quantitative evaluation of SCF resources.  

The gold standards, containing SCF frequencies for 30 verbs and 10 verbs, respectively, make possible the
quantitative and qualitative evaluation of SCF lexicons in
biomedicine, which has not previously been undertaken despite the
importance of subcategorization for 
biomedical Natural Language Processing (NLP).
% information extraction. 
The new lexicon, BioCat, is acquired automatically from PMC OA using state-of-the-art
tools developed for general language SCF acquisition. We compare the accuracy of BioCat
to that of the BioLexicon, the only previously
%  We compare the
% performance of the BioLexicon -- the only previously 
existing
automatically-acquired SCF lexicon for biomedicine, which was
extracted from corpus data in the {\it E.~coli} subdomain using NLP technology adapted to the subdomain of molecular biology.
%  -- with BioCat, a new lexicon which we
% have acquired automatically from the entire PubMed Open Access
% collection (PMC OA) using SCF acquisition tools developed for general
% language. 
Our results show that the BioLexicon has greater precision
while BioCat has better coverage of SCFs. Although the BioLexicon
shows better overall 
% performance
accuracy, this 
% performance 
accuracy is 
still
considerably
lower than that reported with general English lexicons,
demonstrating the need for domain- and subdomain-specific SCF acquisition tools for biomedicine.
% domain adaptation in SCF acquisition.

% We also provide the first quantitative exploration of different definitions of subcategorization for biomedicine, through the introduction of an additional, smaller gold standard resource. 
It is well known that the standard definition of subcategorization in biomedicine is different from that for general language, since it includes more adjuncts (modifiers) as part of the SCF. 
% However, the implications of the different definitions for system
% accuracy have not been quantitatively assessed. Through the
% introduction of an additional, smaller gold standard resource, we
% provide the first such assessment. 
Our new gold standards reveal major differences 
% Cross-resource comparison reveals major differences 
when annotators use the different definitions.
% We also provide the first quantitative evaluation of the implications of two 
% definitions of subcategorization, based on whether the argument-adjunct distinction
% is maintained. 
Moreover, evaluation of
% an automatically acquired SCF lexicon 
BioCat
% against the two different gold standards 
% reveals major 
% performance
yields major differences in 
accuracy
% differences 
depending on the gold standard, demonstrating that the definition of subcategorization adopted will have a direct
impact on perceived
system 
accuracy.
% performance.


\end{abstract}

\begin{keyword}
verb subcategorization \sep lexical resources \sep natural language processing \sep biomedical text processing

%% keywords here, in the form: keyword \sep keyword

%% MSC codes here, in the form: \MSC code \sep code
%% or \MSC[2008] code \sep code (2000 is the default)

\end{keyword}

\end{frontmatter}

%%
%% Start line numbering here if you want
%%
% \linenumbers

%% main text
\section{Introduction}
 \label{intro}

{\it Verb subcategorization}
refers to the tendency of verbs to ``select'' co-occurrence with certain
phrase types. 
% For example, the verb {\it decrease} can
% be transitive or intransitive ({\it isoguvacine decreased the number of spikes} and {\it the number of spikes decreased} are both fully grammatical), while {\it
% compare} cannot ({\it we compared the ratio} is fully grammatical, but {\it the ratio compared} is not). 
For example, the verb {\it detect} can be transitive (taking a single direct object) or it can take a clausal complement: {\it A routine x-ray of the thorax detected [$_{NP}$pneumonia]} and 
{\it Researchers have detected [$_{S}$that the tissues have high levels of Wnt signaling components]} are both fully grammatical sentences. In contrast, the verb {\it examine} can be transitive, but cannot take clausal complements: {\it The study examined [$_{NP}$the relationship between ankle-brachial index and stroke]} is fully grammatical, but not {\it The study examined [$_{S}$that there is a relationship between ankle-brachial index and stroke]}.

Any natural language processing (NLP) application that makes use of
predicate-argument structure can make use of subcategorization
information. {\it Subcategorization frames} (SCFs) were recently used
by \citet{ananiadou:10,rupp:10} to improve event extraction from
UK PubMed Central documents. SCFs have the potential for wide
application in many other tasks such as entailment detection, relation
extraction, syntactic and semantic parsing, all of which are important in biomedical NLP. Moreover, SCF information
is more easily acquired from text corpora than similar linguistic structures used in biomedicine, such as Predicate
Argument Structures \cite{review}.

% but SCF information has not yet been very
% widely used in biomedicine, but it has the potential for wide
% application, since it is useful for information extraction but more
% easily acquired than Predicate Argument Structures (see \cite{review}
% for a review of subcategorization in biomedicine).

In \cite{review} we reviewed the state of the art with regard to verb
subcategorization for biomedicine. We observed that there are a
limited number of existing biomedical verb SCF resources, and to date
their development has relied on either introspective, manual collation
of SCFs, which results in resources that lack coverage, or automatic
identification of SCFs using tools adapted to a single biomedical
subdomain. Adaptation of such tools is labor-intensive, and the
resulting resources may still lack coverage because the tools are not
adapted to the broader biomedical literature. 
We showed 
% in \cite{review} 
that biomedical
subdomains show notable and complex variation in verb
subcategorization behavior, highlighting the need for
minimally-supervised tools to automatically acquire SCF information,
since such tools can be applied to different subdomains with minimal
manual intervention.  

Moreover, we observed in \cite{review} that the quality of existing verb SCF resources for biomedicine is unknown, due to a lack of available gold standards which can be used for evaluation.
% so the quality of the state of the art resources is noted
% in \cite{review} that there have been no gold standards available to
% evaluate biomedical verb SCF resources, so the quality of existing
% resources is unknown.  
% Moreover, 
The effect of 
% using 
adopting
a more lenient
definition of subcategorization compared to the traditional linguistic definition, as is typically done in biomedical NLP,
has also not been
ascertained due to the lack of gold standards.

% In \cite{review} we called for three new methods to help assess the
% state of the art and advance the field of SCF acquisition in
% biomedicine: (1) automatic methods for SCF acquisition in biomedicine,
% since they will help remove dependence on tools manually adapted to
% particular subdomains; (2) quantitative evaluation of acquired
% subcategorization lexicons in biomedicine; and (3) quantitative
% investigation of different definitions of subcategorization which have
% been previously used in biomedicine and other NLP fields. In this
% paper we begin to answer the call. 
% % First, after a brief summary of subcategorization on Secton~\ref{subcat}, we introduce two new
% % resources. Section~\ref{gold} 

In this paper we present three new resources for biomedical SCF acquisition.
% , and introduce new evaluations to assess the state of the art.  
% First 
We introduce 
% two new gold standards,
%  for subcategorization in biomedicine, 
the first gold standards for SCF in biomedicine,
drawn from across a variety of
subdomains and capturing the different definitions of subcategorization. 
% These novel gold standards make it possible for the first time to perform quantitative and qualitative evaluation of biomedical SCF resources, and to assess the impact of different definitions of subcategorization.
% Section~\ref{subcat_system} 
We also introduce a new, automatically
acquired biomedical SCF lexicon, BioCat, created by using an SCF
acquisition system originally developed for general language and
applied to a large corpus covering a wide variety of biomedical
subdomains.
%  created by using a toowith a general-language tool. 
We use our gold standard to
evaluate BioCat and the BioLexicon \cite{sasaki:08,biolexicon:2008,biolexicon,biolexicon_new}, a previously existing but not previously evaluated SCF resource.
% , constituting the first quantitative evaluation of
% subcategorization acquisition in biomedicine.
% , in Section~\ref{subcat_system}. 
The two resources, BioCat and the BioLexicon, provide an interesting
contrast since the former is built using tools unadapted to biomedicine and the latter built using tools adapted to a particular subdomain. 
%  since one is highly tuned for a particular subdomain and the
% other is not tuned at all to the biomedical domain.
% , in Section~\ref{}. 
% We also use our two contrasting gold standards to perform
% % Finally, we introduce a second gold standard which
% % allows us to do 
% the first quantitative and qualiative investigation of
% definitions of subcategorization 
% % definitions 
% in biomedicine.
We find that the BioLexicon has greater precision while BioCat has better coverage of SCFs, but that the overall accuracy of both resources is still below that which has been attained for SCF resources in general language. 
For BioCat, we introduce a new method for filtering hypothesized SCFs that improves accuracy by drawing on knowledge of subcategorization tendencies in general language.
Finally, we find that the definition chosen for subcategorization
% , whether the more restrictive definition used in traditional linguistics or the more lenient definition typically chosen for biomedicine, 
has a notable effect on 
the resulting gold standards
% the resulting resource 
and on the perceived accuracy of SCF acquisition systems.

The rest of the paper is organized as follows. Section~\ref{subcat} provides a brief overview of subcategorization in biomedicine. Section~\ref{gold} introduces the new gold standards, and Section~\ref{subcat_system} introduces the new SCF lexicon, BioCat. The evaluation methodology is described in Section~\ref{methods} and the results in Section~\ref{results}.

% For this purpose we manually
% annotate a SCF gold standard, comprising 30 verbs with
% subcategorization frames and frequencies, using data from across the PubMed Open Access collection (PMC
% OA) \citep{PMC:09}. This is the first gold standard suitable for evaluating statistical 
% SCFs systems in biomedicine. We compare two automatically acquired SCF lexicons against this gold standard.  The first is the BioLexicon, which, as described above, was acquired using
% components individually adapted to subdomains of biomedicine, and
% applied to a corpus representing a particular subdomain. The second lexicon is the BioCat, 
% a novel large SCF lexicon which we acquired from the PMC OA corpus (representing 
% the largest such corpus to be used in automatic SCF acquisition\footnote{Although e.g.~the
% SPECIALIST lexicon has broad coverage of subdomains, it is less
% comprehensive due to being manually compiled.}, 
% using a SCF system developed at the University of Cambridge 
% \citep{preiss:07}. The Cambridge system was previously used for general language, 
% and we have applied it to a biomedical corpus without any domain adaptation. 
% This paper
% represents the first evaluation and comparison of SCF systems in biomedicine and
% allows us to gain insight into how current technology performs and how
% SCF information should ideally be acquired.

% Second, we explore the meaning of subcategorization,
% particularly in terms of the traditional argument-adjunct distinction 
% and the role of highly selected adjuncts which have been sometimes 
% included as part of SCFs in biomedicine. We
% manually annotate two SCF gold standards corresponding to two different
% definitions of subcategorization, to determine
% their impact on the overall shape of the gold standard and the
% accuracy of a subcategorization acquisition system when applied to
% biomedicine.

% Overall, our investigation seeks to provide the field of biomedical
% information processing with a much-needed baseline representing the
% current state of the art in SCF acquisition for biomedical text and an 
% understanding of how such systems should be developed further.
% Previous work with large datasets for SCF aquisition has focused on
% general language, so this investigation contributes towards our
% knowledge of how to build domain-specific systems. We find that 
% existing SCF systems suffer performance degradation in
% biomedicine compared to general language. The more
% labor-intensive system manually adapted to a subdomain of biomedicine 
% suffers most in recall, while the unadapted general language system 
% suffers most in precision. Neither system reaches the impressive
% performance level seen with previous experiments on general English.
% In addition, we find that the
% treatment of the argument-adjunct distinction has a major effect on
% the ultimate shape of the resulting lexicon, and consequently on
% measured performance of SCF acquisition systems. 
% Taken together, these points
% suggest the need for minimally supervised domain adaptation 
% which can be applied easily (i.e.~without substantial manual effort) 
% to the entire biomedical domain as well as to different subdomains, as
% required. We discuss future approaches, and release the resources 
% presented here with the article (our different 
% gold standards, along with the large BioCat lexicon) so that they 
% can benefit further work in this area.

% The companion paper also pointed out that there is wide subdomain
% variation in subcategorization behaviour, and that automatic resource
% acquisition is needed for this reason.  Although we do not investigate subdomain variation specifically in this paper, the methods are suitable.  This is particularly interesting when we consider how much tuning is done on the tools that are part of the scf acquisition pipeline.





\section{Subcategorization Frames in Biomedicine}
 \label{subcat}

Subcategorization refers to the tendency of a verb to ``select'' co-occurrence with particular sets of syntactic phrases, including noun phrases (NPs), prepositional phrases (PPs), subordinate clauses, and adjectives. Some 
% basic 
examples of subcategorization frames (SCFs) can be seen in
Table~\ref{t:basic}.\footnote{For the SCF names we use COMLEX Syntax notation
\citep{grishman:94}, which includes an abbreviation for each phrase type in the SCF. Thus the SCF for a transitive verb (taking one direct object noun phrase) is NP, and for a verb taking a direct object and a prepositional phrase NP-PP.  We do not specify the subject NP as part of the SCF, since subjects are obligatory in English.} Most verbs take several SCFs. 
% In Table~\ref{t:basic}, it can be seen that {\it decrease} may occur with the following SCFs: NP, NP-PP, or $\oslash$ (intransitive). On the other hand, {\it compare} occurs with the first two but not as an intransitive.

\begin{table}
\begin{tabular}{|l|p{0.8\columnwidth}|}
\hline
SCF & Example \\
\hline
\hline
NP & This physical, mental, or emotional tension of an individual \underline{decreases} [$_{NP}$the feeling of being in control]. \\
NP-PP & Heterozygosity for twine also \underline{decreases} [$_{NP}$the frequency of precocious NEB] [$_{PP}$to less than 10\%]. \\
PP-PP & The proportion of subjects with moderate-to-severe symptoms \underline{decreased} [$_{PP}$from 29.6\%] [$_{PP}$to 2.3\%].  \\
intransitive & In the control group SV \underline{decreased}. \\
\hline
% NP-PP & An aim of our experiment was to \underline{compare} [$_{NP}$IRFs] [$_{PP}$to results from previous fMRI studies]. \\
NP & We \underline{compared} [$_{NP}$the performance of the Charlson and the Elixhauser comorbidity measures]. \\
NP-PP & We \underline{compared} [$_{NP}$mandatory celiotomy] [$_{PP}$to laparoscopy]. \\
\hline
NP & A post hoc analysis \underline{revealed} [$_{NP}$a statistically significant relationship between timing of fondaparinux dose and bleeding]. \\
THAT-S & This observation \underline{revealed} [$_{S}$that systolic and diastolic BP increased during the interdialytic period]. \\
NP-TOBE & [$_{NP}$The incidence for cardiovascular events] was \underline{revealed} [$_{S}$to be 1.13\%]. \\
\hline
\end{tabular}
\caption{Sample SCFs for {\it decrease}, {\it compare}, and {\it reveal}. All examples adapted from the PMC OA corpus.}
\label{t:basic}
\end{table}

% Additional examples of SCFs are shown in
% Table~\ref{t:complex_scfs}. Here the COMLEX SCF names include
% mnemonics for some additional information beyond the simple phrasal
% types. For example, the frame NP-AS-NP is a subclass of NP-PP, where
% the preposition is lexicalized as {\it as}. The frame NP-TOBE
% represents a direct object and a predicate using {\it to be}. The
% frame THAT-S represents a sentential complement introduced by the
% complementizer {\it that}, and TO-INF is an infinitival complement that uses the {\it to} form of the verb in the lower clause.

% \begin{table}
% \begin{tabular}{|l|p{0.75\columnwidth}|}
% \hline
% SCF & Example \\
% \hline
% \hline
% NP-AS-NP & Perception of complex stimuli occurs too rapidly to \underline{support} rate coding as a reliable mechanism. \\
% NP-TOBE & The larger, unsaturated propyne group has been \underline{shown} to be a useful modification for antisense oligonucleotides. \\
% PP-PP & Threshold values \underline{ranged} from 0.01 to 0.99. \\
% THAT-S & Experiments with PTEN-null PGCs in culture \underline{revealed} that these cells had greater proliferative capacity.\\
% TO-INF & Administration of DA agonists to the rat PFC \underline{acts} to enhance working memory in these animals.\\
% \hline
% \end{tabular}
% \caption{Sample SCFs. All examples adapted from the PMC OA corpus.}
% \label{t:complex_scfs}
% \end{table}

In the traditional linguistic view of subcategorization, a distinction is made between {\it arguments} and {\it adjuncts}.  Arguments are phrases that obligatorily co-occur with the verb, while less closely associated modifiers such as location, manner, or temporal phrases are adjuncts and not part of the SCF.  
% SCFs are normally defined to include arguments only.\footnote{In reality, the argument-adjunct distinction is a gradient one. Adjuncts can be considered ``highly selected'' if they are more closely tied to the meaning of the verb.} It is well-known, however, that biomedicine uses a more inclusive view of
% subcategorization than more traditional linguistic studies, in which
% phrase types considered adjuncts in other approaches, such as
% location, manner, or temporal modifiers, are instead considered part
% of the SCF (or, in related work, PAS). This is because such
% information 
The biomedical NLP field, however, has adopted a more inclusive view of subcategorization, including adjuncts in the SCFs,
because the information they contain
is considered important for information extraction in
biomedicine \citep{cohen:06,wattarujeekrit:04}. 
We will refer to the traditional linguistic view as ``syntactic'' (i.e. grammar-level), since it emphasizes syntactic obligatoriness, and the biomedical
view as ``semantic'' (i.e. meaning-level), since it emphasizes semantic importance.
We observed in \cite{review} 
that the effects of these two views
on resource creation and automatic acquisition
% , which we call the ``syntactic'' and
% ``semantic'' views, 
have not been quantitatively evaluated, and 
in this paper we perform the first investigation of SCF
acquisition 
that explicitly compares the two definitions of subcategorization.




\section{Biomedical SCF Gold Standards}
    \label{gold}


% In this section we describe 
We have produced the first set of biomedical SCF gold standards,
% we produced for our experiments, and 
which we make available as a resource to the biomedical NLP community.
% our investigations into current SCF technology in biomedicine
% (Section~\ref{investigation_tech}) and the role of different
% definitions of subcategorization 
% (Section~\ref{investigation_subcat}). 
% To our knowledge, these are the
% first SCF gold standards 
% designed 
% for evaluation of SCF systems 
% which have been produced for biomedicine.
% To the best of our knowledge there are no previous gold standards
% available that are suitable 
% for evaluation of automatically-acquired SCF lexicons for biomedical text.  
% These resources have been produced by manually annotating 150-200 sentences randomly chosen from across the PMC OA for each verb in the gold standard. 
% Production of SCF gold standards by manual annotation of corpus data is important, since 
% resources based on introspection rather than systematic corpus annotation are not sufficiently comprehensive, due to many SCF types being missed, especially the more rare SCFs.
% % As a general
% % principle, manually developed resources tend not to be sufficiently
% % comprehensive in their SCF coverage to serve as gold standards, due to
% % the rarity of many SCF types, which may be missed during the
% % introspective process of resource creation.  
% For example, the majority
% of verbs considered in PASBio have just two attested frames, compared
% to 9 for general language verbs in the gold standard associated with
% VALEX, and 6 in the gold standards we produce here.  Importantly, resources built without the assistance of corpus data also lack the statistical information that is naturally gathered
% during automatic production and important for NLP systems.  
% % The BioLexicon, on the other hand, while produced from a
% % corpus, is unsuitable to be used as a gold standard because the output
% % has not been manually corrected. Moreover, the filtering used (a
% % relative frequency cutoff of 0.03), while suitable for removing noise
% % from a lexicon, is unsuitable for gold standard creation because many
% % SCFs are genuinely rare.
Each resource has been produced by selecting a set of representative verbs, then manually annotating 150-200 sentences randomly chosen from across the PMC OA for each verb, in order to provide broad coverage of multiple subdomains. Annotation of corpus data is crucial to avoid missing SCF types \cite{review}, as well as to gather statistical information about SCF frequency, which can be important for resource evaluation.

\subsection{SCF Inventory}
\label{scf_inventory}

For annotation of corpus data we chose to use the SCF inventory of \citet{preiss:07},
a rich, manually-developed inventory previously used for general language. It consists
of 163 SCFs obtained by manually merging the SCFs exemplified in
the COMLEX Syntax \citep{grishman:94} and ANLT \citep{boguraev:87}
dictionaries, along with 
% a small number of 
some
additional frames
identified by inspection of general language data.  We refer to this
inventory as the ``Cambridge inventory'' because it was developed at the University of Cambridge. Not all of the 163 possible SCFs in the inventory will be used for any given dataset; our gold standards used a total of 27 SCF types from the inventory.

\subsection{Annotation Tool}

A
% custom-built 
custom tool was developed and used for annotation. 
The tool 
% which 
highlighted the target verb in each sentence 
and allowed the annotator to select an SCF from a drop-down menu. The annotator could also customize responses to particular sentences, for example by flagging problematic examples, or adding comments for later reconciliation.
% , as
% well as add comments.  
A screen shot of the annotation tool is shown
in Figure~\ref{f:annointerface}. 

\begin{figure*}
\includegraphics[height=.4\textheight]{figures/bw_annotation.png}
\caption{Annotation interface.}
\label{f:annointerface}
\end{figure*}


\subsection{Semantic Gold Standard}
\label{semgold}

Our main gold standard contains 30 verbs annotated using the ``semantic''
definition of subcategorization 
% traditionally 
favored for
biomedicine (see Section~\ref{subcat}). We refer to this gold standard as SEM-30. The verbs were
chosen based on frequency\footnote{Verbs needed to be frequent enough
to ensure enough data to annotate for the gold standard, as well as
enough raw corpus data for the SCF acquisition system to produce a comprehensive
lexicon.}, occurrence across both biomedical and general
language text, and the fact that they are known to take multiple SCFs in
biomedical text. 
% Among such verbs, 
We also preferred verbs that we believed may
have developed specialized senses in biomedicine -- e.g. {\it
activate} -- since 
% this makes it more likely that they may have specialized SCFs as well.
specialized senses often correspond to specialized SCFs.
The first column of Table~\ref{t:verblist} shows the verbs in SEM-30.
%  (further verb sets represented in Table~\ref{t:verblist} will be introduced in Sections~\ref{syntactic_gold} and~\ref{investigation_tech}).
%  and all the gold standards, with
% SEM-30 on the left. In Section~\ref{investigation_tech} we will
% introduce SEM-26, which is a subset of SEM-30 consisting of the 26
% verbs in SEM-30 that also appear in the BioLexicon, to which we compare BioCat; the semantic gold standard is described below, and the overlap also.

\begin{table}
\begin{tabular}{|l|c|c|c|c|}
\hline
\multirow{3}{*}{verb} & \multirow{3}{*}{SEM-30} & \multirow{3}{*}{SEM-26} & SYN-10 & Overlap \\ 
 & & & and & with \\
 & & & SEM-10 & \citep{preiss:07} \\
\hline
\hline
activate & $\bullet$ & $\bullet$ & $\bullet$ &  \\
analy(z/s)e & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\
associate & $\bullet$ & $\bullet$ & $\bullet$ & \\
cause & $\bullet$ & $\bullet$ & & $\bullet$ \\
compare & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\
contain & $\bullet$ & $\bullet$ & & \\
decrease & $\bullet$ & $\bullet$ & $\bullet$ & \\
detect & $\bullet$ & $\bullet$ & & \\
develop & $\bullet$ & $\bullet$ & & \\
enhance & $\bullet$ & $\bullet$ & & \\
examine & $\bullet$ & & & \\
express & $\bullet$ & $\bullet$ & $\bullet$ & \\
fail & $\bullet$ & & & \\
follow & $\bullet$ & $\bullet$ & & \\
generate & $\bullet$ & $\bullet$ & & \\
improve & $\bullet$ & $\bullet$ & $\bullet$ & \\
increase & $\bullet$ & $\bullet$ & & \\
induce & $\bullet$ & $\bullet$ & & $\bullet$ \\
inhibit & $\bullet$ & $\bullet$ & & \\
modify & $\bullet$ & $\bullet$ & & \\
mutate & $\bullet$ & $\bullet$ & $\bullet$ & \\
occur & $\bullet$ & $\bullet$ & $\bullet$ & \\
perform & $\bullet$ & & & \\
predict & $\bullet$ & & $\bullet$ & \\
produce & $\bullet$ & $\bullet$ & & $\bullet$ \\
recogni(z/s)e & $\bullet$ & $\bullet$ & & \\
reduce & $\bullet$ & $\bullet$ & & \\
regulate & $\bullet$ & $\bullet$ & & \\
transcribe & $\bullet$ & $\bullet$ & & \\
treat & $\bullet$ & $\bullet$ & & \\
\hline
\end{tabular}
\caption{Verbs in the gold standards SEM-30 (full gold standard, Section~\ref{semgold}), SEM-26 (overlap of SEM-30 with verbs in the BioLexicon, Section~\ref{semgold}), SYN-10 and SEM-10 (comparative syntactic and semantic gold standards, Section~\ref{syntactic_gold}), and those that overlap with the general language gold standard of \citep{preiss:07} (Section~\ref{investigation_subcat}).}
\label{t:verblist}
\end{table}

% Sentences for annotation were randomly selected from across all
% subdomains of PMC OA. The annotator annotated betwen 100-250 
% sentences per verb. 

% For the semantic annotation, 
The 
annotator, a biomedical NLP expert, was instructed to include in the SCF 
all phrases attached to the verbal head which were important for
biomedicine, and also aimed for similarity with 
% guided by 
the semantic role types in PropBank \citep{propbank}, a corpus of verbal propositions and their arguments.

% One question we had prior to annotation was 
Prior to beginning the annotation we did not know
whether the Cambridge inventory, developed for general language, would be appropriate for biomedical text. We found that it was; during
annotation of SEM-30, only 20 sentences, or about 0.3\% of the 6,473
total annotated sentences, were identified by the annotator as
involving an SCF not included in the Cambridge inventory. Since the number of examples was so small, 
we chose to discard these sentences rather than modify the
inventory.

The SEM-30 gold standard, which includes SCFs and their relative frequencies for each verb, 
 was derived directly from the annotations.
A sample entry from SEM-30 for the verb {\it transcribe} is shown in
Figure~\ref{f:sample-gs}. 
We will also refer to SEM-26, which is simply a subset of SEM-30 used for comparative evaluation. The second column of Table~\ref{t:verblist} shows the verbs in SEM-26.
We used SEM-30 and SEM-26 to evaluate SCF acquisition systems (see Sections~\ref{investigation_tech}, \ref{comparative}, \ref{results_cambiolex}, \ref{results_comparative}).

\begin{figure}
\begin{tabular}{|lr|}
% \centering
\hline
\multicolumn{2}{|c|}{transcribe}\\
NP & 0.719424 \\
NP-PP & 0.215827 \\
NP-as-NP & 0.021583 \\
NP-PP-PP & 0.014388 \\
PP & 0.014388 \\
INTRANS & 0.007194 \\
ADVP & 0.007194 \\
\hline
\end{tabular}
\caption{Sample gold standard entry for {\it transcribe} from SEM-30. Column 1 shows the SCF and column 2 shows the relative frequency across sentences annotated for the gold standard.}
\label{f:sample-gs}
\end{figure}

\subsection{Syntactic Gold Standard}
\label{syntactic_gold}

In order to investigate the difference between the semantic and syntactic
annotation styles, we chose ten verbs from SEM-30 to annotate according to
the syntactic definition of subcategorization. 
% For this comparison, 
We
% inspected the corpus data and 
chose verbs that appeared to occur in the corpus with
a relatively large number of highly selected adjuncts, making them
more likely to exhibit variation 
% in subcategorization behavior 
across
the two definitions of subcategorization.  We refer to this gold standard as
SYN-10, 
and the same verbs annotated with the semantic definition as SEM-10 (i.e. SEM-10 is a subset of SEM-30).
% and the corresponding set of semantically annotated 
% verbs SEM-10.  
The third column of Table~\ref{t:verblist} shows the verbs in
SYN-10 and SEM-10.

A 
second 
annotator, a linguistics expert, performed the syntactic annotation for SYN-10, and was given different guidelines from those given to the semantic annotator.\footnote{We could have attempted to rule out an effect of
  different individuals doing the annotation by asking both annotators
  to annotate the data twice, once with each set of guidelines, but
  the time and effort required would have been prohibitive.} 
The
syntactic
 annotator was instructed to use the
traditional criterion of optionality 
to distinguish arguments from adjuncts.
% when deciding on the SCF.  
We again used our
annotation tool (Figure~\ref{f:annointerface}).
% , and the
% annotator again annotated between 100-250 instances of each verb.  



We used the Cambridge inventory of SCFs for syntactic as well as
semantic annotation. However, the syntactic and semantic
interpretations 
% for any given SCF in the inventory may differ. 
differ for the same SCFs. For
example, according to the syntactic 
% annotation guidelines, 
definition, the frame NP-ADVP would only be used
for certain obligatory adverbs such as {\it there} as in the sentence {\it She put $[_{NP}$it$]$
$[_{ADVP}$there$]$}. Under the semantic annotation, the use of this frame would be extended, e.g.~for
adverbs such as {\it significantly} or {\it normally}.

% As examples of sentences that the annotators treated differently,
% consider 
Sentences \ref{ex:activate-1} and \ref{ex:activate-2} are examples of sentences
that the annotators treated differently. In
\ref{ex:activate-1}, the semantic annotator chose the frame NP-ADVP,
treating the adverb {\it significantly} as part of the SCF, while the
syntactic annotator chose the simple transitive frame NP.
%  (the NP is {\it transcription}). 
In \ref{ex:activate-2}, the semantic annotator
chose the frame NP-PP, treating the PP {\it through the canonical and
noncanonical pathways} as part of the SCF, while the syntactic
annotator again chose NP.
%  (the NP is {\it NF-kB}). In both cases the
% syntactic annotator was judging by what was syntactically obligatory,
% and the semantic annotator by what was important to an understanding
% of the event.

\ex.\label{ex:activate-1} The p53 mutant, which contains a disabled DNA-binding domain, does not \underline{activate} $[_{NP}$transcription$]$ $[_{ADVP}$significantly$]$.

\ex.\label{ex:activate-2} Both receptors \underline{activate} $[_{NP}$NF-kB$]$ $[_{PP}$through the canonical and noncanonical pathways$]$, with RANK specifically requiring TRAF6.


As with SEM-30, the SYN-10 gold standard was derived directly from the annotations.
Any
sentence which had not been annotated by both annotators (due to
differences in opinion about whether the sentence was a valid
instance) was discarded. 
% The total number of sentences for each verb
% in SYN-10 and SEM-10 
% can be seen in Table~\ref{t:agreement} in Section~\ref{investigation_subcat}. 
SYN-10 was used to measure differences between the semantic and syntactic annotation approaches, both by direct comparison of the resulting gold standards (Sections~\ref{investigation_subcat}, \ref{results_direct}) and investigation of how the annotation approach affected the evaluation of an acquired SCF lexicon (Sections~\ref{eval_sem}, \ref{results_sem}).

% \subsection{Release of Gold Standards}

% Along with this article we release the gold standards.
% The archive contains the full SEM-30 and SYN-10.  Users can of course create subsets of SEM-30 as desired. Each file has 30 or 10 verbs (respectively) with relative frequencies of SCF in lines of the format ``SCF COUNT''. Singletons have been removed from the annotation. (???)


\section{BioCat: A New Subcategorization Resource for Biomedicine}
\label{subcat_system}

There are few existing SCF resources for biomedicine, and those that
exist tend to rely on manual development or domain-specific tools. The
new resource we present here, BioCat, takes a different approach. It
uses a set of tools developed for SCF acquisition in general language
which are, except for the POS tagger, 
% unlexicalized and therefore
domain-independent\footnote{The tools are domain-independent because they are unlexicalized; that is, they have no information about specific words, only general information about English. For example, the parser has no verb-specific information about the likelihood of different syntactic arguments. See \cite{review}.}
, and applies these tools to a large biomedical
corpus.

% We introduce here a new resource.  According to \cite{review}, the parts of a subcategorization acquisition system are: preprocess and parse, classify, filter.

We produced BioCat using an updated version of the tools in
\cite{preiss:07}, which we will refer to as the Cambridge system, or
Cambridge tools.\footnote{The updating consisted of a more recent
  unpublished version of the SCF classifier, which re-implemented the
  original classifier rules in a different programming language.} 
An SCF acquisition system in general consists of hypothesis generation (pre-processing, parsing, and identifying potential SCFs using a classifier) followed by filtering (see \cite{review} for an overview).
In
the Cambridge system an input corpus is first parsed with the RASP system
\cite{briscoe:06}. A classifier consisting of manually-defined rules
then matches the RASP output to the SCFs in the Cambridge inventory
(Section~\ref{gold}), and the resulting lexicon is filtered. 
% Note that this SCF system was designed for general laguage and consists only of general-language tools, and is not adapted to biomedical at all, but applied to a large biomedical corpus.




\subsection{Subcategorization Acquisition System}

For pre-processing and parsing we used RASP, a modular statistical parsing system which
includes a tokenizer, tagger, lemmatizer, and a wide-coverage
unification-based tag-sequence parser. We used the standard scripts
supplied with RASP to output the set of grammatical relations (GRs)
for the most probable analysis returned by the parser or, in the case
of parse failures, the GRs for the most likely sequence of
subanalyses. 
RASP is an unlexicalized parser, meaning that
it does not have access to a lexicon of information about the behavior
of specific words (as opposed to classes of words, e.g.~words with particular part-of-speech tags), and thus does not
already embody a notion of subcategorization.\footnote{To test the influence of different parsers, we
performed an experiment using output of the unlexicalized Stanford
parser \citep{klein:03} as input to the subcategorization steps and found that accuracy was the same on the SCF evaluation as for RASP.}
To identify potential SCFs, 
% In the Cambridge system, 
a rule-based classifier incrementally matches GRs with 
% the corresponding 
SCFs.
The rule set was an updated version of that used in \citep{preiss:07}; note that it was developed for general language and not adapted for biomedical text.
From the classifier output, preliminary lexical entries are
constructed for each verb, containing the raw and relative frequencies
of SCFs found for each verb in the data. Finally, the entries are
filtered to obtain a more accurate lexicon. 

\subsection{Filtering}

We used two filtering methods.  The first method was simple relative
frequency filtering.  Here, an empirically
determined minimum threshold is set on the relative frequencies of SCFs, so that only SCFs with per-verb relative frequencies above the threshold are retained.
% filtering out SCFs whose relative frequency for a given verb is lower
% than the threshold. 
This simple method has been shown to yield more
accurate results than more complex statistical hypothesis tests
\citep{korhonen:02}. 
Previous work on SCF acquisition for general language using similar
SCF systems found a threshold of 0.02 to give the most accurate
results. In development experiments on held-out data we found 0.02 and 0.03 to give
the most accurate results under different conditions, and we chose to
use a threshold of 0.03 to match the threshold used by the BioLexicon (see Section~\ref{investigation_tech}).

Second, we used a novel method which we call SCF-specific
filtering. The intuition behind this method is that the appropriate
reliability threshold for each SCF may be different, since some SCFs
are inherently much more frequent than others.  We did not have
information about the overall frequency of the different SCFs in
biomedical text, so we used information about their overall frequency
in general language from the COMLEX and ANLT dictionaries, along with
empirical information about high and low frequencies from the
unfiltered lexicon acquired for biomedicine, to set a specific
threshold for each SCF.  
% Although this method gives more accurate
% results overall, 
We tested this method to see if it could improve accuracy, even though
it uses information about general language which may
or may not be applicable to the biomedical domain.


\subsection{Input Corpus}

For our corpus we used the PubMed Open Access Subset (PMC OA), which
is the largest publicly available corpus of full-text articles in the
biomedical domain \cite{PMC:09}.  PMC OA comprises 169,338 articles
drawn from 1,233 medical journals indexed by the Medline citation
database, totalling approximately 400 million words.  Articles are
formatted according to a standard XML tag set \cite{PMC:XML:09}. 
% The
% National Institute of Health (NIH) maintains a one-to-many mapping
% from journals to 122 subdomains of biomedicine \cite{PMC:Subjects:09}.
% The mapping covers about a third of the PMC OA journals, but these
% account for over 70\% of the total data by word count.  Journals are
% assigned up to five subdomains, with the majority assigned one (69\%)
% or two (26\%). 
We used the same dataset as \cite{Lippincott:2011,review},
composed of journals that are assigned to a single subdomain,
excluding subdomains with fewer than one million words of data. The
resulting dataset contains a total of 342 journals in 37 biomedical
subdomains.
% , with Genetics and Medical Informatics being the largest,
% and Complementary Therapies and Ethics the
% smallest. See \cite{Lippincott:2011} for the distribution
% of PMC OA data by subdomain.
 It has been shown that the open access collection is representative of the broader biomedical literature \citep{Verspoor:EtAl:09}.



% \subsection{Release of BioCat Lexicon}

% Along with this article we release BioCat, the lexicon obtained by
% using the Cambridge tools on the PMC OA data.
% The archive contains a file for each subdomain we studied, as well as
% an ``overall'' file for the entire PMC OA data set (note that the
% latter is more than just the union of the former, since not all
% journals are assigned a subdomain).  Each file has raw counts of each
% verb/SCF combination in lines of the format ``VERB SCF COUNT''.  These
% counts are unfiltered, and so include low-occurence verb-SCF combinations: while this 
% is useful for research purposes, practical
% use of the resource in NLP applications will likely require filtering 
% appropriate to the task at hand.  The file ``subcat\_frames.xml'' describes the SCF
% inventory used in this study, with cross-references to other common
% inventories (e.g.~COMLEX, the BioLexicon, etc).


\section{Evaluation Methods}
\label{methods}

We performed a number of experiments designed to evaluate current SCF technology and the definition of subcategorization.

\subsection{Evaluation of BioCat}
  \label{investigation_tech}

The purpose of this experiment was to evaluate our new resource, BioCat, against a biomedical SCF gold standard, SEM-30.
% evaluate how well an SCF system
% designed for general language and consisting only of general-language
% tools, applied to a biomedical corpus, could perform against a
% biomedical SCF gold standard.
%   Our first experiment is an evaluation
% of current SCF technology in the field of biomedicine, using our new
% gold standard.  
% Using SEM-30 as the gold standard, 
We evaluated two versions of BioCat, corresponding to the two
filtering methods described in Section~\ref{subcat_system}.
% : relative
% frequency filtering, and SCF-specific filtering, which relies on prior
% knowledge about general language SCF distributions. 
We used standard evaluation measures from previous SCF evaluations
for general language \citep{korhonen:02,preiss:07}, namely type
precision, type recall, and F-score (the harmonic mean of precision
and recall). We noted the number of SCFs present in the gold standard but missing from the filtered output, i.e. not just missing for a particular verb but missing altogether, as a way of evaluating the coverage of the SCF system. We also noted the number of gold standard SCFs unseen in
the {\it unfiltered} system output; that is, false negatives which were not
detected at all by the classifier even before filtering.
% (rather than being filtered out).

\subsection{Comparative evaluation of BioCat and the BioLexicon}
\label{comparative}

The purpose of this experiment was to perform a comparative evaluation
of BioCat against another SCF resource, the BioLexicon
\cite{sasaki:08,biolexicon,biolexicon_new}. BioCat was built using
unadapted, general language tools applied to a multi-subdomain
biomedical corpus; while the BioLexicon was built using tools adapted
to a single biomedical subdomain, applied to data also drawn from that
subdomain. 

Both lexicons 
% represent 
result from 
state-of-the-art approaches to biomedical SCF
acquisition; and given the impracticability of manually adapting NLP
tools to every subdomain, both approaches are natural, with BioCat having a potential advantage from the wide coverage of its source data, and BioLexicon having a potential advantage from domain adaptation. This
experiment tests how the two approaches perform against an SCF gold
standard drawn from a wide variety of subdomains (see \cite{review} for further discussion).

% Both lexicons
% represent state-of-the-art approaches to biomedical SCF acquisition,
% but neither is perfectly suited to an SCF gold standard drawn from
% across a wider variety of subdomains, so the comparison is a
% particularly interesting one (see \cite{review}).  

BioCat was
described in Section~\ref{subcat_system}; here we give a brief
description of the BioLexicon, followed by a description of the
mapping between the two lexicons.

% Next we performed a comparative evaluation of BioCat against another existing resource. Our intention here was to see how a system trained using bio-specific tools, but only on a single subdomain, could perform against a gold standard constructed from a wider variety of subdomains. This comparison is illuminating because the two resources are produced using very different approaches. As discused in Section~\ref{subcat_system}, BioCat is unadapted. On the other hand, the BioLexicon was built using tools adapted for one subdomain of biomedicine.

% We chose two
% different automatically acquired SCF lexicons to evaluate against
% SEM-30: BioCat, the new lexicon described in
% Section~\ref{subcat_system}, and the BioLexicon \cite{biolexicon}.
% The Biolexicon was built using tools adapted to a subdomain of
% biomedicine.  BioCat was built using tools developed for general
% language \cite{briscoe:97,preiss:07}, and we have adapted them only by
% applying them to a biomedical rather than a general domain corpus.


\subsubsection{The BioLexicon}

The BioLexicon \citep{sasaki:08,biolexicon,biolexicon_new} is currently the only biomedical NLP
resource containing an automatically constructed SCF lexicon. It
% includes
is built on
 data from the E. Coli subdomain, and each component used in
acquisition of the lexicon -- for example, the part-of-speech tagger,
named entity recognizer, and parser -- 
% has been
was  manually adapted to
the subdomain of molecular biology. 

To create the BioLexicon, six million words of MEDLINE E. Coli
abstracts and articles were parsed with the Enju deep
parser \citep{enju}, which was adapted to the biomedical domain as
described in \citep{hara:06}, using a variety of external resources such as GENIA \citep{Kim:EtAl:03}.  Unlike RASP, Enju is a lexicalized parser, which means that it already contains a notion of subcategorization, which can be adapted to different domains.  No SCF inventory was assumed in advance; rather,
the set of grammatical relations for each verb instance was considered
as a potential SCF.  Potential SCFs were filtered using simple relative frequency filtering, at a
threshold of 0.03, leading to
% , i.e. for any given verb, all SCFs with a relative frequency less than 0.03 are discarded. 
% Filtering leads to 
an 
inventory of 136 SCFs.  Further arguments and strongly-selected adjuncts were
chosen according to their log-likelihood with respect to the verb.

% Moreover, 
% the parsing model
% used in SCF discovery
% is lexicalized, with a built-in notion of subcategorization, and is
% tuned for biomedical data using a variety of external resources such
% as GENIA \citep{Kim:EtAl:03}.  While there are immediate benefits to these
% approaches in terms of accuracy in SCF acquisition within the same
% % % % domain as the training data, the model's reliance on manual annotation is costly, 
% % and its preconception of subcategorization may introduce
% % bias against new subdomain behaviors.
% and it may perform less well across all the subdomains.

% The BOOTStrep BioLexicon \citep{biolexicon} contains automatically-produced verb subcategorization data.  

%   It
% is important to note that the BioLexicon draws on a single subdomain of
% biomedical literature.  

The BioLexicon is 
% publicly 
available through ELRA\footnote{\url{http://catalog.elra.info}.}. 
We used the BioLexicon exactly as provided without additional training or
adaptation.   No evaluation is
provided with the BioLexicon which would reveal how well the acquisition
technology performs on E. Coli or on general biomedical corpus data, so our experiment represents the first such evaluation. We used SEM-26 for the evaluation, since four verbs in SEM-30 were not included in the BioLexicon.

\subsubsection{Mapping between lexicons}
\label{mapping}

% To better understand the strengths and weaknesses of the BioLexicon and 
% BioCat,
% we evaluated them against the SEM-30 and SEM-26 gold standards
% (see 
% Section~\ref{gold}).

% Second, we performed a comparative evaluation of 
% BioCat
% with the BioLexicon. 

% Since the BioLexicon includes only 26 of the 30 verbs in SEM-30, we
% used SEM-26 for the evaluation, a subset of SEM-30 consisting of those
% 26 verbs (see second column of Table~\ref{t:verblist}).

Performing a comparative evaluation was not straightforward, since the mapping
between the BioLexicon SCF inventory and the Cambridge inventory, used for the gold standard and BioCat, is
many-to-many.\footnote{The Cambridge inventory makes some more fine-grained linguistic distinctions, whereas the BioLexicon has more lexicalized elements in the SCFs.}
% Recall that the gold standard uses the Cambridge inventory. 
% We used
% two mapping methods. First, ``best match'' manually chose the single
% best SCF for each BioLexicon SCF and mapped it to the Cambridge
% inventory to match the gold standard. 
We first used a ``best match'' mapping in which we manually selected the closest match in the Cambridge inventory for each BioLexicon SCF. This mapping resulted in a set of 22 SCF types for the BioLexicon.\footnote{This is far lower than the 97 SCF types reported for the BioLexicon in \citep{biolexicon:2008} (we found 136 SCF types when querying the BioLexicon). However, since the BioLexicon inventory differentiates SCFs containing PPs based on the lexicalized preposition, many SCFs were collapsed during the mapping: PP-{\it from}, PP-{\it to}, PP-{\it of}, etc. would all map to PP in the Cambridge inventory. For comparison, recall from Section~\ref{scf_inventory} that the SEM-26 gold standard contained 27 SCF types from the Cambridge inventory.} The mapped BioLexicon and BioCat in its native format were both evaluated directly on SEM-26.

To make sure the mapping did not penalize the BioLexicon, we also used a ``coarse'' mapping which represented a common denominator between the Cambridge and BioLexicon inventories.
We
semi-manually created equivalence classes of SCFs such that both
inventories could distinguish the classes from one another, and these classes became the SCFs in a new, coarse-grained inventory containing 14 broad SCFs. We mapped BioCat, the BioLexicon, and the gold standard to this coarse inventory.\footnote{To aid future experimentation, the ``best match'' mapping and the resources mapped to the coarse inventory are included in the public release of materials accompanying this paper.}
% and performed an evaluation.
% The resulting coarse-grained inventory contained 14 broad SCFs. We
% evaluated both BioCat
% and the BioLexicon against a version of the gold
% standard which had been mapped to this coarser inventory. We again
% report type precision, type recall, F-score, and missing SCFs.
% Recall that the BioLexicon inventory is induced from the
% parsed corpus, whereas the Cambridge inventory is pre-defined. Each
% inventory is more fine-grained in certain areas. For example, the
% Cambridge inventory includes multiple frames for various constructions
% that are distinguished in linguistics, such as predicate nominals
% ({\it He seemed a fool}, where {\it fool} is predicated of {\it He})
% as opposed to direct object nominals ({\it He saw a fool}). On the
% other hand, the BioLexicon inventory differentiates SCFs with PP
% arguments according to preposition, so that NP-PP-{\it through},
% NP-PP-{\it from}, and NP-PP-{\it for}, etc. are different frames,
% while the Cambridge inventory has only two frames with an NP-PP
% configuration.  Since SEM-26 is annotated using the Cambridge
% inventory, we had two options: map the BioLexicon inventory to the
% Cambridge inventory and evaluate the BioLexicon directly on SEM-26, or
% modify SEM-26 to use a common intermediate representation and map both
% inventories to this representation for evaluation. We used both these options.
% We call the manual mapping from the BioLexicon inventory to the Cambridge inventory
% ``best match''. Here we manually examined each SCF in the BioLexicon
% inventory and chose which single SCF in the Cambridge inventory it was most
% likely to correspond to. Following the example above, the BioLexicon
% frame NP-PP-{\it through}, NP-PP-{\it from}, etc. would map to the Cambridge
% frame NP-PP. Similarly, the BioLexicon frame NP-PP-{\it into}-PP-{\it on}
% would map to the Cambridge frame NP-PP-PP.  This process
% resulted in a set of 22 SCF types for the BioLexicon. This is far lower
% than the 97 SCF types reported in \citep{biolexicon:2008}\footnote{We found 136 unique SCF types when querying the BioLexicon.}, 
% since we collapse the SCFs that are lexicalized for preposition.
% After performing the ``best match'' mapping, we evaluated the
% BioLexicon and 
% BioCat
% directly on SEM-26. 

A simple
relative frequency threshold of 0.03 was used for filtering in both
the BioLexicon and 
BioCat.\footnote{We did not use SCF-specific filtering in this experiment since it was not available for the BioLexicon.}
% Although we have demonstrated that general language statistics can be
% successfully used for SCF-specific filtering in biomedicine, 
% We used relative
% frequency thresholds for the comparative evaluation (i.e. not the SCF-specific filtering available for BioCat) since it provides
% a level playing field for the BioLexicon and BioCat.
% For the coarse mapping, we had a coarse version of SEM-26.
We report type precision, type recall, and F-score against SEM-26 and the coarse gold standard. We also
report the number of SCFs missing from the filtered lexicon,
% Note that an SCF may be missing from the filtered lexicon either because it was not in the lexicon at all, or because it had low frequency and was filtered out.\footnote{We do not report SCFs unseen in the unfiltered lexicon here, because we did not have access to the unfiltered BioLexicon.}
but not the number of SCFs unseen in the unfiltered lexicon, because we did not have access to the unfiltered BioLexicon.

% Because of the many-to-many nature of the mapping, we were concerned
% that the ``best match'' mapping might be unfair to the BioLexicon,
% since it only captured one SCF in the Cambridge inventory for each SCF
% in the BioLexicon, even though there might be more than one legitimate
% choice. Therefore, we also pursued another method to handle the
% different SCF inventories.

% We created from our gold standard a new gold standard with a much
% coarser-grained SCF inventory. We did this by semi-manually creating
% equivalence classes of SCFs based on types that both the BioLexicon
% and Cambridge inventory could differentiate. First, we expanded the
% best match by manually defining a more ``inclusive'' match, listing
% all the Cambridge inventory SCFs which could be a match to a
% BioLexicon SCF (accounting for the one-to-many aspect of the
% BioLexicon-Cambridge inventory mapping). Then we created a bipartite
% graph in which one set of nodes represented the Cambridge SCFs and the
% other set represented the BioLexicon SCFs. Edges represented
% ``inclusive'' mapping rules. Each connected component was then considered a
% coarse SCF. We call this mapping to coarse-grained SCFs semi-manual,
% because the inclusive mapping rules were manually defined, but the
% equivalence classes were found automatically.



\subsection{Direct comparison of semantic and syntactic annotation}
    \label{investigation_subcat}

The purpose of this experiment was to investigate the semantic and
syntactic definitions of subcategorization, by direct comparison of
the manually annotated gold standards resulting from the two
approaches. 

We compared SYN-10 and SEM-10 using the kappa measure
\citep{cohen:60}. Kappa is typically used to measure inter-annotator
agreement when multiple annotators perform the same task on the same
data.
% , and serves as a measure of the difficulty of the task and the
% reliability of the human annotation. 
% However, in this case, we do not
% have a typical inter-annotator agreement scenario, since 
However, in our case
the two
annotators were given different instructions, so kappa 
measures the difference between the two {\it methods} of annotation,
corresponding to the two definitions of subcategorization.

We also compared the number of SCFs per verb in SEM-10 versus SYN-10, to check whether the semantic style of annotation produces a wider variety of SCFs.
% as a measure of XXX.  We also compared the same number across all of
% SEM-30, and in the general language gold standard of \cite{preiss:07}
% [ALL OF IT OR JUST THE OVERLAP??]
Since SYN-10/SEM-10 verbs were chosen for their higher number of adjuncts, this measure might over-represent the number of SCFs found in semantic annotation, so we also compared the number of SCFs per verb in all of SEM-30. Finally, we compared these values with the number of SCFs per verb in the general language gold standard of \cite{preiss:07}, for those verbs in SEM-30 also appearing in \cite{preiss:07} (see Table~\ref{t:verblist}, rightmost column).

% Our third investigation looked at the notion of subcategorization in
% biomedicine, and how it affects manual and automatic development of
% lexical resources.  In Section~\ref{subcat} we introduced two
% definitions of subcategorization: the ``syntactic'' definition defines
% verbal arguments as those syntactic phrases which occur obligatorily
% with a verb, while the ``semantic'' definition defines verbal
% arguments as those syntactic phrases which are important for a full
% description of a biomedical event. Section~\ref{gold} describes how we manually annotated a set of
% sentences using both definitions, resulting in the SYN-10 and SEM-10 gold standards. In this section we describe how we used these resources to 
% make comparisons
% between the two definitions 
% of subcategorization,
% and also to investigate the effects on accuracy of an automatic SCF acquisition
% system.
% Both of these comparisons are novel in the literature.

% We compared SEM-30, SEM-10,
% and the general language gold standard of
% \citep{preiss:07}; see Table~\ref{t:gs_stats}, using various measures. 


\subsection{Evaluation of BioCat using SEM-10 and SYN-10}
\label{eval_sem}

The purpose of this experiment was to investigate how the 
definition of subcategorization used in the gold standard affects the
perceived accuracy of an automatically acquired SCF lexicon. We used
BioCat for this experiment since its inventory matches that of the
gold standards.
% , so there is no mapping problem. 
We evaluated BioCat
directly on SYN-10 and SEM-10.

\section{Results and Discussion}
\label{results}

\subsection{Evaluation of BioCat}
\label{results_cambiolex}

\begin{table}
\begin{tabular}{| l | r | r | r | r |}
  \hline
  Filtering     &  F-score & Prec & Rec & Missing \\ % & Unseen\\
  \hline
  \hline
  0.03 thresh & 44.96 & 39.37 & 52.41 & 13 \\ % & 0\\
  SCF-specific  & 59.94   & 60.87     & 59.04 & 11 \\ % & 0 \\
  \hline
\end{tabular}
\caption{Accuracy of BioCat on SEM-30. Missing SCFs were missing altogether from the filtered lexicon.}
% ; unseen SCFs were missing from the unfiltered lexicon.}
\label{t:mainresult}
\end{table}

The accuracy of BioCat on SEM-30 is shown in Table~\ref{t:mainresult}.
With relative frequency filtering, the system achieves an overall
F-score of about 45, with recall favored over precision. Using
SCF-specific filtering, the system achieves an F-score of nearly 60
with precision slightly favored over recall. This improvement
demonstrates that knowledge about general language SCFs can be useful
for filtering in biomedicine. The number of missing SCFs also
decreases slightly when using SCF-specific filtering, indicating that
this filtering method is more successful at retaining SCFs which are
rare but correct. We note that no SCFs were completely
unseen in the {\it unfiltered} lexicon, meaning that the system is capable
of finding all the SCFs in the gold standard.

The result for SCF-specific filtering is about 9 points lower than
state of the art methods for general language, e.g.~\citep{preiss:07}.
It is a respectable result considering that no adaptations were made
to the SCF acquisition system besides applying it to a large
biomedical corpus, but it does show that there is a need for
adaptation to the biomedical domain.

\subsection{Comparative Evaluation of BioCat and the BioLexicon}
\label{results_comparative}

\begin{table}
\begin{tabular}{| l | r | r | r | r | }
  \hline
  Lexicon             & F-score & Prec & Rec & Missing \\
  \hline
  \hline
  BioCat    & 46.20   & 40.00     & 54.68 & 11\\
  BioLexicon          & 58.37   & 87.14     & 43.88 & 20 \\
  \hline
\end{tabular}
\caption{Accuracy of BioCat (threshold 0.03) and the BioLexicon, using best-match, on SEM-26.}
\label{t:best-match}
\end{table}


\begin{figure}
\begin{tabular}{|p{0.9\columnwidth}|}
\hline
%  NP-ING-SC:\\ 
 NP-ING:\\ 
This study indicates that all treatment protocols seemed to be sufficiently effective and safe and that [$_{NP}$cheyletiellosis in rabbits] can be successfully \underline{treated} [$_{ING}$using ivermectin or selamectin] in clinical practice.\\[2pt]
While the AT immunologic activity is normal in this deficiency, [$_{NP}$plasma AT functional activity] is markedly \underline{reduced} [$_{ING}$leading to risk of thrombosis].\\[5pt]
NP-PP-PP:\\ 
{[}$_{NP}$ The constitutive TRPV1t activity] is \underline{inhibited} [$_{PP}$to baseline] [$_{PP}$in the presence of SB].\\[2pt]
Unlike the Tetrahymena ribozyme, [$_{NP}$the changes] \underline{induced} [$_{PP}$in precursor RNA] by incubation [$_{PP}$in the absence of divalent cations] result in activation of the ribozyme.\\
\hline
\end{tabular}
\caption{Examples of SCFs in SEM-26 and BioCat but missing from the BioLexicon.}
\label{f:missing-fine}
\end{figure}

% This phenomenon is dose-dependently \underline{inhibited} by leukotriene receptor antagonism with FPL 55712, SK\&F 104353 and montelukast.\\[2pt]
% STZ inhibited hOGA competitively with Ki = 64 ± 3 μM with 3 min incubation, and there is no change in the dose-response curve for the longer time periods of incubation.



The accuracy of BioCat
and the BioLexicon on SEM-26 using ``best match'' (our first strategy for mapping the disparate SCF inventories of the two resources; Section~\ref{mapping}) is shown
in Table~\ref{t:best-match}.\footnote{Note that the figures for BioCat differ from those in the first row of Table~\ref{t:mainresult} because they
are for only 26 verbs.}
We can see that the BioLexicon has a much higher
F-score than BioCat even though it uses simple relative frequency filtering, approaching the F-score achieved
by 
BioCat
with SCF-specific filtering. Interestingly, we can also
see that the BioLexicon strongly favors precision over recall, while 
BioCat
is stronger on recall. The high precision of the BioLexicon is a result of the fact that it is produced with a deep, lexicalized parser already
adapted to the biomedical domain, including a POS tagger trained on biomedical text. 
% This means that the output of the parsing stage already took 
The input to the SCF classifier thus already takes
into account some subcategorization information specific to the biomedical domain. This results in a high-precision system for biomedical text, but relies on up-front domain adaptation, whereas the Cambridge system is less precise but can be ported to new domains as long as there is a large corpus of raw data available. 

The higher recall of BioCat likely reflects the
fact that it is built from across PMC OA, while the BioLexicon is based on only a single subdomain of
biomedicine.
% , while BioCat is built from across PMC OA. 
% It can be seen that 
The Cambridge system is able to hypothesize SCFs which are likely to be important for interpretation of the text; the trade-off, however, is that the Cambridge system hypothesizes more frames overall, resulting in relatively low precision. This may be overcome in the future, however, with more sophisticated filtering methods, as suggested by the results in Section~\ref{results_cambiolex}.



Figure~\ref{f:missing-fine} shows examples of SCFs found in the SEM-26 gold standard and in BioCat but not in the BioLexicon. Such frames are potentially important for information extraction, demonstrating the importance of recall in SCF acquisition.
% The importance of higher recall for information extraction can be
% seen when we look at the SCFs in SEM-26 which are not included in the BioLexicon. The sentences
% in Figure~\ref{f:missing-fine} are examples of frames which appear in 
% BioCat
% but not in the BioLexicon. 
Note that the BioLexicon may include these frames for other verbs, but at least for the verbs in SEM-26 they were either filtered out or not present to begin with.



\begin{table}
\begin{tabular}{| l | r | r | r | r |}
  \hline
  Lexicon        &     F-score & Prec & Rec & Missing\\
  \hline
  \hline
  BioCat   & 65.38   & 55.43     & 79.69 & 2\\
  BioLexicon  & 69.23   & 90.00     & 56.25 & 4\\
  \hline
\end{tabular}
\caption{Accuracy of BioCat (threshold 0.03) and the BioLexicon using coarse-grained inventory, on SEM-26.}
\label{t:coarse}
\end{table}

The accuracy of BioCat and the BioLexicon using the 
coarse-grained inventory (our second strategy for mapping SCF inventories; Section~\ref{mapping}) is shown in
Table~\ref{t:coarse}. As expected, both lexicons show higher accuracy
when evaluated using this more forgiving inventory. The same general trends 
% found earlier
still hold, however, with the BioLexicon favoring precision while 
BioCat
favors recall.

Note that even using the coarse-grained SCF inventory, the BioLexicon is
missing more SCFs from the filtered lexicon than 
BioCat.
Figure~\ref{f:missing-coarse} shows examples of coarse-grained 
frames that were in the coarse-grained gold standard and BioCat, but missing
from the BioLexicon.

\begin{figure}
\begin{tabular}{|p{0.9\columnwidth}|}
\hline
THAT-S:\\ 
Additionally, our image analysis allowed us to \underline{detect} [$_{S}$that FTG mice also ventured further into the open arm compared to FNTG controls].\\[2pt]
All the caregivers \underline{expressed} [$_{S}$that the feeling of safety for the patient and the caregiver was essential], emphasizing that professional back-up 24 hours a day was important.\\[5pt]
ING:\\ 
All of these stimuli \underline{activated} [$_{ING}$signaling] through the MAP kinase/ERK pathway and led to the induction of P-YB-1S102.\\[2pt]
Although none of the mutations \underline{increased} [$_{ING}$binding] to the same degree as removing the entire USH, they had little effect on the solubility of the protein compared to removal of the entire USH.\\

\hline
\end{tabular}
\caption{Examples of SCFs in the coarse-grained version of SEM-26 and BioCat but missing from the BioLexicon.}
\label{f:missing-coarse}
\end{figure}


% A number of SCF acquisition systems have been developed for general
% language (usually newswire)
% text \citep{korhonen:02,valex,preiss:07}. Very good accuracy has been
% obtained, although the best results use sophisticated methods such as
% smoothing the SCF distributions smoothing based on the semantic
% classes of the verbs \cite{korhonen:02}.  


\subsection{Direct comparison of semantic and syntactic annotation}
\label{results_direct}

\begin{table}
\begin{tabular}{| l | l | l |}
  \hline
  verb & Kappa score & instances\\
  \hline
  \hline
  activate & 0.204022 & 152 \\
  analy(s$|$z)e & 0.214015 & 227 \\
  associate & 0.803061 & 203 \\
  compare & 0.390602 & 224 \\
  decrease & 0.498399 & 173 \\
  express & 0.479512 & 223 \\
  improve & 0.619959 & 239 \\
  mutate & 0.548926 & 108 \\
  occur & 0.044539 & 242 \\
  predict & 0.311750 & 172 \\
  \hline
  overall & 0.586751 & 1963 \\
  \hline
\end{tabular}
\caption{Agreement between methods using instructions for syntactic and semantic gold standards.}
\label{t:agreement}
\end{table}



Table~\ref{t:agreement} shows the results of the kappa agreement test.
 The overall kappa was 0.59, well below the
0.67 threshold which is considered a minimum for moderate agreement
on NLP annotation tasks \citep{krippendorff:80}. The low kappa indicates that the definition of subcategorization has a significant effect on how the resulting gold standards will look.
% , indicating the
% definitions have an effect on the resulting gold
% standards. 
The kappa for some verbs was well below 0.5.\footnote{Recall, however, that the ten
verbs in SYN-10 and SEM-10 were chosen in part for their large number
of adjuncts, so the agreement between the syntactic and semantic
methods of annotation might be lower for this set of verbs than for others
in SEM-30.}

\begin{table}
\begin{tabular}{|l|c|c|c|c|}
\hline
 & SEM-10 & SYN-10 & SEM-30 & Gen \citep{preiss:07} \\
\hline
\hline
Low &  4 & 3 & 1 & 1 \\
High & 10 & 9 & 10 & 25 \\
Avg & 6.6 & 5.9 & 5.4 & 9.4 \\
\hline
\end{tabular}
\caption{Number of SCFs per verb in the different gold standards.}
\label{t:gs_stats}
\end{table}



The average number of SCFs per verb in the different gold standards is shown in Table~\ref{t:gs_stats}. SEM-10 had an average of 6.6 SCFs
per verb, ranging from a low of 4 to a high of 10, while SYN-10 had an
average of 5.9 SCFs per verb, ranging from a low of 3 to a high of
9. This observation suggests that the syntactic method results in
annotating slightly fewer frames per verb than the semantic method,
which is not surprising since the semantic method takes into account
a broader range of phrases, including some that the syntactic method
might consider adjuncts. However, the difference is not large.

The average number of SCFs per verb is slightly higher for SEM-10 than
SEM-30, which may reflect the fact that the ten verbs were manually
selected based on their large number of adjuncts.
% , which may lead to
% additional frames. 
More interestingly, the average and the
maximum number of SCFs per verb are much lower for SEM-30 than for the
general language gold standard of \citet{preiss:07}.  This observation suggests that verb
usage becomes specialized in biomedical text, with the range of
SCFs being only a limited subset of those observed in general
language. Interestingly, this was the case even though the semantic
definition of subcategorization was used for SEM-30, and the syntactic
for the general language gold standard.


\subsection{Evaluation of BioCat using SEM-10 and SYN-10}
\label{results_sem}

Table~\ref{t:syngs} shows the results of evaluating BioCat against SEM-10 and SYN-10.   Interestingly,
BioCat
is more accurate on SEM-10 than SYN-10, despite the fact that it uses
 syntactic information (parser output) as the input to
hypothesis generation. This reflects the fact that the Cambridge
system hypothesizes a wide variety of phrases as parts of the SCFs,
including some that are considered adjuncts by the linguistic definition of subcategorization, but not by the biomedical definition.

Note that the F-score for SEM-10 is lower than for the full set of 30 verbs in SEM-30 (Table~\ref{t:mainresult}); precision in particular is much lower. This is
because the small number of verbs provides insufficient evidence
across SCFs for the SCF-specific filtering to perform at its best
(although it still slightly out-performs threshold filtering).

\begin{table}
\begin{tabular}{| l | r | r | r |}
  \hline
  Gold standard & F-score & Prec & Rec \\
  \hline
  \hline
  Semantic      & 53.19   & 40.98     & 75.76 \\
  Syntactic     & 47.67   & 36.28     & 69.49 \\
  \hline
\end{tabular}
\caption{Accuracy of BioCat (with SCF-specific filtering) on semantic and syntactic gold standards, for the ten verbs in the syntactic gold standard.}
\label{t:syngs}
\end{table}


\section{Conclusions}
\label{conclusions}

Our study has provided some insights into the current state of verb
subcategorization frame acquisition for biomedicine.  We have made available the first set of biomedical SCF gold standards suitable for performing quantitative and qualitative evaluation of automatically acquired biomedical SCF resources. Using the Cambridge SCF acquisition system, which was 
% unadapted 
not specifically adapted
for biomedicine but applied to a large biomedical
corpus, we acquired BioCat, a biomedical SCF lexicon which achieved reasonable results using simple relative
frequency filtering.
% threshold similar to previously reported optimum
% thresholds. 
A new method of SCF-specific filtering was found to offer
improved accuracy even though it depended on SCF frequency information
from general language. Still, SCF acquisition performance drops off considerably
compared to general language, losing nearly 10 points on F-score,
indicating that there is room for adaptation of SCF systems to
biomedicine.

We compared two biomedical SCF lexicons, each representing a different
aspect of the state of the art in SCF acquisition. We found that the
BioLexicon, built with an SCF acquisition system in which each
component has been adapted to biomedical text using manually annotated
data in the molecular biology subdomain, favored precision over recall
when evaluated against our SCF gold standard drawn from across PMC
OA. On the other hand, BioCat, built using a state of the art system
for general language SCF acquisition and unadapted to biomedical text
save for the input corpus, favored recall over precision. 
% Neither type
% of system yields high enough overall performance, and 
The contrast
between the two highlights the need for techniques
that can acquire SCF information from a broader range of subdomains.

Overall, it can be seen that the accuracy of both BioCat and the
BioLexicon against a biomedical gold standard is lower than for
general language SCF acquisition against general language SCF gold standards \citep{korhonen:02,valex,preiss:07}.
We believe 
the lower accuracy
arises from different sources for the two
lexicons. 
BioCat is insufficiently adapted to biomedical text, and hypothesizes a
wide variety of SCFs inappropriate for the domain, resulting in low precision. The BioLexicon, on the other hand, suffers from lower recall, which may mean that
a system whose components
have been manually adapted 
to a single subdomain does not generalize well enough to the
variety of
subdomains in PMC OA.
New methods for biomedical SCF acquisition are clearly needed
in order to create accurate, scalable SCF lexicons
to help with downstream NLP tasks, but an approach which
relies heavily on manual work will not port easily between 
different domains. New, minimally supervised SCF acquisition methods such as that of \citet{lippincott:12} have recently become available and can be used for acquiring domain- and subdomain-specific SCF lexicons. In addition, 
some of the best results on SCF acquisition for general language
have used information about verb semantic classes to smooth
conditional SCF distributions \citep{korhonen:02}, based on the
linguistic fact that semantically similar verbs tend to have
syntactically similar behavior. This avenue needs further
exploration in biomedicine. Incorporating word sense disambiguation
may also improve accuracy and understanding of subcategorization in
biomedicine, especially since 
% we see that 
verb behavior in different
subdomains may involve overlays of general and specialized senses.


% We suggest the use of less-supervised approaches
% for domain adaptation in SCF acquisition.
% (see the discussion in Section~\ref{conclusions}.

% Next, 
We observed that using two different definitions of
subcategorization -- the ``semantic'' definition, which collapses the
argument-adjunct distinction, and the ``syntactic'' definition, which
retains it -- results in very different styles of annotation, and
therefore different evaluation results for an SCF system depending on
the definition used in the gold standard.  Interestingly, because the
Cambridge system,
based on a sophisticated SCF inventory,
 readily hypothesizes many phrase types co-occurring
with verbs as part of the SCF, it is more consistent with the semantic
definition of subcategorization and achieved higher accuracy on the
semantic gold standard than the syntactic one. This behavior may or
may not be desirable depending on the application, but needs to be
taken into consideration.

% Our results suggest several drawbacks to current methods of SCF
% acquisition for biomedicine.  First, differences in the definition of
% subcategorization, and consequently different SCF inventories and gold
% standards, make performance comparisons difficult.  Second, the large
% performance drop when applying an unadapted system to biomedical text
% demonstrates the need for domain adaptation. 

% Although direct evaluation of SCF acquisition is important, it could be supplemented
% with task-based evaluation which uses the output of a system to augment performance 
% on a downstream task that is easier to assess \citep{vlachos:2011}.
% For example, an unlexicalized parser or relationship extractor could be augmented with SCF
% probabilities, and then re-evaluated to determine improvement.  In this setup, the definition of subcategorization and the SCF inventories used by each system would not need to be reconciled: the candidate parses would simply be reranked based on the new probabilities from the lexicon. 
%Some promising results in this direction have already been obtained for general language \cite{Carroll:98}.
% The reference above is just too old.

% By decoupling evaluation from a particular definition and inventory, unsupervised or 
% minimally-supervised SCF acquisition methods, such as those based on clustering and graphical models, could be developed and evaluated alongside the current supervised and rule-based methods.
% Unsupervised approaches have a particular advantage in domain adaptation, since they do not
% rely on manually created resources and because their definitions and inventories emerge from their 
% domain-specific input data. Ideally, such approaches would also involve moving away from features that require manual domain-adaptation for optimal performance (such as parser output), to shallower and more robust features like parts-of-speech or phrase chunking.  There are a range of semi-supervised methods between these
% extremes, such as self-training and hybrid graphical modeling \citep{zhu:2006}, which may help yield optimal performance while minimising the need for manual annotation.  An interesting area for future work is determining an optimal middle ground.

% Some of the best results on SCF acquisition for general
% language have used information about verb semantic classes to smooth
% conditional SCF distributions \citep{korhonen:02}, based on the
% linguistic fact that semantically similar verbs tend to have
% syntactically similar behavior. This avenue needs further exploration
% in biomedicine. Incorporating word sense disambiguation may also
% improve accuracy and understanding of subcategorization in
% biomedicine, especially since we see that verb behavior in different
% subdomains may involve overlays of general and 
% specialized
% senses \cite{review}.

Finally, we make the new resources we have created and presented in this article, including our 
different gold standards and the large BioCat lexicon, publicly available
so that they can benefit further research in this area.\footnote{\url{http://www.cl.cam.ac.uk/~tl318/BioCat.tgz}.} 

%These can be found at \url{http://www.cl.cam.ac.uk/~tl318/BioCat.tgz}.


\section*{Acknowledgments}

This work was funded by the EU FP7 project `PANACEA', EPSRC (UK) grant
EP/G051070/1, and the Royal Society (UK). We gratefully acknowledge
Yuval Krymolowski for programming the SCF-specific filtering, and
Diane Nicholls for syntactic annotation.


% \section{Theory/Calculations}

%
%% The Appendices part is started with the command \appendix;
%% appendix sections are then done as normal sections
%% \appendix

%% \section{}
%% \label{}

% \section{Appendices}



%% References
%%
%% Following citation commands can be used in the body text:
%% Usage of \cite is as follows:
%%   \cite{key}          ==>>  [#]
%%   \cite[chap. 2]{key} ==>>  [#, chap. 2]
%%   \citet{key}         ==>>  Author [#]

%% References with bibTeX database:

%\bibliographystyle{model3-num-names}
\bibliographystyle{plain}
\bibliography{jbmi_bib}

%% Authors are advised to submit their bibtex database files. They are
%% requested to list a bibtex style file in the manuscript if they do
%% not want to use model3-num-names.bst.

%% References without bibTeX database:

% \begin{thebibliography}{00}

%% \bibitem must have the following form:
%%   \bibitem{key}...
%%

% \bibitem{}

% \end{thebibliography}


\end{document}

%%
%% End of file `elsarticle-template-3-num.tex'.

